"""
#
# Flow stability for dynamic community detection https://arxiv.org/abs/2101.06131v2
#
# Copyright (C) 2021 Alexandre Bovet <alexandre.bovet@maths.ox.ac.uk>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

This script extracts countries from author affiliations of the APS dataset.

It creates the files `doi_to_affilitations_authors_dict.json` and
`affiliations_to_countries.json`.

"""

import json

from collections import Counter
import pandas as pd



#%% first get doi and date for each paper

# Per-paper table. According to the original note, it holds the location of
# the JSON file of each APS article from the aps-dataset-metadata-2018
# archive, which can be obtained from the APS.
df_doi_dates = pd.read_csv(
    '../data/aps/df_doi_dates.csv.gz',
    index_col=0,
)

#%% load edge_list (papers with from 2 to 10 authors)

# Every edge of the temporal network between 1893-07-01 and 2009-12-31.
df_edges = pd.read_csv(
    '../data/aps/all_journals_disamb_edges.csv.gz',
    index_col=0,
)

#%%
# add institution info
# Column `file` of df_doi_dates: one metadata-file path per paper.
files = df_doi_dates.file

# Maps each DOI of the edge list to the 'affiliations' and 'authors' entries
# copied from that paper's JSON metadata file.
affilitations = {}

# unique() already guarantees that every DOI is visited exactly once, so no
# membership test against `affilitations` is needed inside the loop.
unique_dois = df_edges.doi.unique()

for i, doi in enumerate(unique_dois):
    if not i % 1000:
        # progress report against the actual number of DOIs to process
        print(i, ' over', len(unique_dois))

    # The metadata file is looked up by the part of the DOI that follows the
    # first '/', with a '.json' extension.
    f = files.loc[files.str.endswith(doi.split('/')[1] + '.json')]

    # Exactly one file must match. Raised explicitly (rather than through an
    # `assert` statement) so that the check is not stripped under `python -O`.
    if f.size != 1:
        raise AssertionError(
            'expected exactly one metadata file for DOI {!r}, found {}'.format(
                doi, f.size))

    # JSON is UTF-8 encoded; do not rely on the platform default encoding,
    # affiliation names contain non-ASCII characters.
    with open(f.values[0], encoding='utf-8') as fopen:
        metadata = json.load(fopen)

    affilitations[doi] = {
        'affiliations': metadata['affiliations'],
        'authors': metadata['authors'],
    }

#%%
# Persist the DOI -> {'affiliations', 'authors'} mapping as one JSON document.
with open('../data/aps/doi_to_affilitations_authors_dict.json', 'w') as fopen:
    fopen.write(json.dumps(affilitations))


#%% group affiliations and extract locations

# Distinct affiliation names over all papers, deduplicated through a set and
# then turned into a list so that they can be enumerated.
affil_set = list({
    affiliation['name']
    for record in affilitations.values()
    for affiliation in record['affiliations']
})

#%%make trigram list
import nltk
from nltk.tokenize import word_tokenize

# Flat lists of all trigrams, bigrams and unigrams (tokens) found in the
# distinct affiliation names.
trigram_list = []
bigram_list = []
unigram_list = []

for i, aff in enumerate(affil_set):
    if not i % 1000:
        # progress report: current index and total number of affiliations
        print(i, len(affil_set))

    # Tokenize once and reuse the tokens for the three n-gram orders
    # (the tokenizer is the expensive step of this loop).
    tokens = word_tokenize(aff)
    trigram_list.extend(nltk.trigrams(tokens))
    bigram_list.extend(nltk.bigrams(tokens))
    unigram_list.extend(tokens)

# Occurrence counts of each n-gram over all affiliation names.
trigram_counter = Counter(trigram_list)
bigram_counter = Counter(bigram_list)
unigram_counter = Counter(unigram_list)

# Trigrams having 'Uni' in at least one of their tokens, with their counts,
# most frequent first.
universities = [(k, v) for k, v in trigram_counter.items()
                if any('Uni' in token for token in k)]

universities = sorted(universities, key=lambda x: x[1], reverse=True)
#%% custom bigrams

# Hand-curated mapping: country name -> list of token bigrams (2-tuples of
# strings) associated with that country.
# NOTE(review): presumably these are compared with the bigrams produced by
# nltk.bigrams(word_tokenize(affiliation)); the matching code is not visible
# here. Every entry must therefore be a 2-tuple: a bare string such as
# 'Texas' can never equal a bigram (the USA list already has (',', 'Texas')).
country_bigrams = {}

country_bigrams['France'] = [(',', 'France'),('Grenoble', ','),('CNRS', ','),
                             ('Paris', 'Cedex'),('91405', 'Orsay'),('Grenoble', 'Cedex'),
                             ('Paris-Sud', ','),
                             ('French', 'Polynesia'),('Polynésie', 'Française'),
                             ('French', 'Guyana')]
country_bigrams['Germany'] = [('of', 'Germany'),('Berlin', ','),
                              ('Max-Planck-Institut', 'für'),('West', 'Germany'),
                              ('Karlsruhe', ','),
                              ('Universität', 'Berlin'),
                              ('Universität', 'Stuttgart'),
                              ('Universität', 'Heidelberg'),
                              ('Universität', 'Karlsruhe'),
                              ('Ruhr-Universität', 'Bochum'),
                              ('Universität', 'Hamburg'),
                              ('Universität', 'Bremen'),
                              ('Universität', 'Dortmund'),
                              ('Universität', 'Innsbruck'),
                              ('Universität', 'Potsdam'),
                              ('Universität', 'Tübingen'),
                              ('Universität', 'Marburg'),
                              ('Philipps-Universität', 'Marburg'),
                              (',','Philipps-Universität'),
                              (',', 'Otto-von-Guericke-Universität'),
                              ('München', 'EU'),
                              ('14195', 'Berlin'),
                              (',','Germany'),
                              (',','Deutschland'),
                              ('Jülich','Aachen'),
                              (',', 'Juelich-Aachen'),
                              ('Forschungszentrum','Jülich'),
                              ('Deutschen','Bundespost'),]
country_bigrams['USA'] = [(',', 'USA'), ('of', 'California'), ('California', ','),
                          (',', 'Massachusetts'), (',', 'Massachusettes'), (',', 'Illinois'), ('New', 'York'),
                          (',', 'Berkeley'),('Berkeley', ','),('Chicago', ','),('New', 'Mexico'),
                          (',', 'Pennsylvania'),(',', 'Maryland'),(',', 'Washington'),
                          (',', 'Michigan'),('Washington', ','),(',', 'Princeton'),
                          (',', 'Stanford'),(',', 'Florida'),('Massachusetts', 'Institute'),
                          ('North', 'Carolina'),('Los', 'Angeles'),
                          ('Stanford', 'University'),
                          ('Cornell', 'University'),
                          ('Princeton', 'University'),
                          ('Harvard', 'University'),
                          ('Harvard', 'College'),
                          ('Yale', 'University'),
                          ('Rutgers', 'University'),
                          ('Vanderbilt', 'University'),
                          ('Boston', 'University'),
                          ('Columbia', 'University'),
                          ('Ohio', 'University'),
                          ('Clark', 'University'),
                          ('Duke', 'University'),
                          ('Clemson', 'University'),
                          ('Harvard-Smithsonian', 'Center'),
                          (',', 'MIT'),
                          ('Cambridge', 'Massachussetts'),
                          (',', 'California'),
                          (',', 'Texas'),
                          (',', 'Ohio'),
                          (',', 'Arizona'),
                          ('Purdue', 'University'),
                          ('Brown', 'University'),
                          ('Tufts', 'University'),
                          ('Syracuse', 'University'),
                          ('Georgia', '30602'),
                          ('30602', 'USA'),
                          (',', 'Indiana'),
                          (',', 'Colorado'),
                          (',', 'Missouri'),
                          (',', 'Fermilab'),
                          (',', 'Alabama'),
                          (',', 'Hawaii'),
                          (',', 'Minnesota'),
                          (',', 'Connecticut'),
                          (',', 'Minnesota'),
                          (',', 'Nebraska'),
                          (',', 'Oregon'),
                          (',', 'Vermont'),
                          (',', 'Montana'),
                          (',', 'Utah'),
                          (',', 'Virginia'),
                          (',', 'Nevada'),
                          (',', 'Maine'),
                          ('Southern', 'Mississippi'),
                          ('American', 'Telephone'),
                          ('Rhode', 'Island'),
                          ('South', 'Dakota'),
                          (',', 'Wisconsin'),
                          (',', 'Wyoming'),
                          (',', 'Mississippi'),
                          (',', 'Arkansas'),
                          (',', 'Oklahoma'),
                          (',', 'Tennessee'),
                          (',', 'Delaware'),
                          ('North', 'Dakota'),
                          (',', 'Louisiana'),
                          (',', 'Louisana'),
                          (',', 'Idaho'),
                          (',', 'Massachussets'),
                          (',', 'Oklahoma'),
                          (',', 'Kentucky'),
                          ('South', 'Carolina'),
                          ('North', 'Carolina'),
                          ('West', 'Virginia'),
                          ('East', 'Virginia'),
                          (',', 'Kansas'),
                          ('Boston', 'Massachusetts'),
                          (',', 'Iowa'),
                          ('New', 'Hampshire'),
                          (',', 'Newark'),
                          (',', 'Alaska'),
                          (',', 'Washnigton'),
                          (',', 'Pennyslvania'),
                          ('Austin', 'Texas'),
                          ('Alabama', 'Polytechnic'),
                          ('Colgate', 'University'),
                          ('Lehigh', 'University'),
                          ('Dartmouth', 'College'),
                          ('Mellon', 'Institute'),
                          ('Beckman', 'Institute'),
                          ('Westinghouse', 'Electric'),
                          ('Westinghouse', 'E.'),
                          ('Mendenhall', 'Laboratory'),
                          ('Ginzton', 'Laboratory'),
                          ('Smith', 'College'),
                          ('Allegheny', 'College'),
                          ('Goucher', 'College'),
                          ('California', 'Institute'),
                          ('Loomis', 'Laboratory'),]


country_bigrams['Italy'] = [(',', 'Italy'), ('Milano', ','),('di', 'Roma'),
                            ('Pisa', ','),('La', 'Sapienza'),
                            (',', 'Rome'),
                            ('Roma', 'ITALIA'),
                            ('di', 'Firenze'),]
country_bigrams['Russia'] = [(',', 'Russia'),(',','Moscow'),('Moscow', ','),('Russian', 'Academy'),
                             (',', 'Russian')]
country_bigrams['Brazil'] = [(',', 'Brazil'), (',', 'Brasil'), ('SP', 'Brasil')]
country_bigrams['Canada'] = [(',', 'Canada'),
                             (',', 'CANADA'),('McGill', 'University'),
                             ('British', 'Columbia'),
                             ('British', 'Colombia'),
                             ('Canadian', 'Institute'),
                             ('Dalhousie', 'University'),
                             ('McMaster', 'University'),
                             ('Université', 'Laval'),
                             (',', 'Québec'),
                             ('Perimeter', 'Institute'),
                             (',', 'Montreal')]
country_bigrams['Japan'] = [(',', 'Japan'), (',', 'Tokyo'),
                            ('Tokyo', ','),
                            ('Tôkyô', ','),
                            (',', 'Tôkyô'),
                            ('Tohoku', 'University'),
                            ('Waseda', 'University'),
                            ('Tokyo', 'University'),
                            ('Osaka', 'University'),
                            ('Kyoto', 'University'),
                            ('Nagoya', 'University'),
                            (',', 'Kyoto'),
                            (',', 'JAPAN')]
country_bigrams['Spain'] = [(',', 'Spain'), ('Barcelona', ','),('de', 'Madrid'),
                            ('48080', 'Bilbo'),
                            ('de', 'Aragón')]
country_bigrams['The Netherlands'] = [('The', 'Netherlands'),
                                      (',', 'Netherlands'),]
country_bigrams['Switzerland'] = [(',', 'Switzerland'),('Zürich', ','),('Lausanne', ','),
                                  ('ETH', 'Zurich'),
                                  ('Confédération', 'Helvétique'),
                                  (',', 'EPFL'),
                                  ('Swiss', 'Institute'),
                                  ('de', 'Lausanne'),]
country_bigrams['United Kingdom'] = [('United', 'Kingdom'),
                                     (',', 'Oxford'),(',','England'),
                                     ('Northern', 'Ireland'),
                                     (',', 'UK'),(',', 'Scotland'),
                                     ('5NH', 'England'),
                                     (',', 'Wales'),
                                     ('Imperial', 'College'),
                                     ('Lancaster', 'University'),
                                     ('Cavendish', 'Laboratory'),
                                     ('Oxford', 'University'),]
country_bigrams['China'] = [('of', 'China'), (',', 'China'), (',', 'Beijing'),
                            ('Chinese', 'Academy'),
                            ('Peking', 'University'),
                            ('Nanjing', 'University'),
                            ('Chinese', 'Center')]
country_bigrams['Korea'] = [(',', 'Korea'),(',', 'Seoul'),
                            ('South', 'Korea'),]
country_bigrams['Argentina'] = [(',', 'Argentina'),
                                ('Santa', 'FeArgentina'),]
country_bigrams['Belgium'] = [(',', 'Belgium'), (',', 'Belgique'), (',', 'België')]
country_bigrams['Australia'] = [(',', 'Australia'),(',', 'Austrialia'),
                                ('Murdoch', 'University')]
country_bigrams['Israel'] = [(',', 'Israel'),('Minerva', 'Center'),
                             ('Tel-Aviv', 'University'),
                             ('Ben-Gurion', 'University'),
                             (',', 'Beer-Sheva'),
                             ('Pearlstone', 'Center'),]
country_bigrams['Sweden'] = [(',', 'Sweden'),('Uppsala', 'University')]
country_bigrams['Poland'] = [(',', 'Poland'),('Polish', 'Academy'),
                             ('Warsaw', 'University'),]
country_bigrams['Finland'] = [(',', 'Finland')]
country_bigrams['Portugal'] = [(',', 'Portugal')]
country_bigrams['Hong Kong'] = [('Hong', 'Kong'),(',', 'Hong-Kong')]
country_bigrams['Austria'] = [(',', 'Austria'),
                              ('Universität', 'Wien'),]
country_bigrams['Denmark'] = [(',', 'Denmark'),('Aarhus', 'University')]
country_bigrams['Mexico'] = [(',', 'Mexico'),(',', 'México'),
                             (',', 'Cuernavaca'),
                             (',', 'Guanajuato')]
country_bigrams['Ukraine'] = [(',', 'Ukraine'),('of', 'Ukraine'),
                              ('83114', 'Ukraine'),('Ukrainian', 'SSR'),
                              (',', 'Ukraïna'),(',', 'Kharkow'),(',', 'Kharkov'),]
country_bigrams['Greece'] = [(',', 'Greece')]
country_bigrams['Hungary'] = [(',', 'Hungary'),
                              ('Hungarian', 'Academy'),
                              (',', 'Budapest')]
country_bigrams['Norway'] = [(',', 'Norway')]
country_bigrams['Slovenia'] = [(',', 'Slovenia')]
country_bigrams['Iran'] = [(',', 'Iran')]
country_bigrams['Turkey'] = [(',', 'Turkey'),(',', 'Türkiye')]
country_bigrams['South Africa'] = [('South', 'Africa')]
country_bigrams['Romania'] = [(',', 'Bucharest'),(',', 'Romania')]
country_bigrams['Taiwan'] = [(',', 'Taiwan'),(',', 'Chungli')]
country_bigrams['Yugoslavia'] = [(',', 'Yugoslavia')]
country_bigrams['Chile'] = [(',', 'Chile')]
country_bigrams['Ireland'] = [(',', 'Ireland'),(',', 'Dublin')]
country_bigrams['Venezuela'] = [(',', 'Venezuela'),('de', 'Venezuela')]
country_bigrams['Bulgaria'] = [(',', 'Bulgaria')]
country_bigrams['Slovakia'] = [(',', 'Slovakia')]
country_bigrams['Kuwait'] = [(',', 'Kuwait')]
country_bigrams['Uruguay'] = [(',', 'Uruguay')]
country_bigrams['Czech Republic'] = [(',', 'Czechoslovakia'),(',', 'Czechia'),
                                     ('Czeck', 'Republic'),('Czech', 'Republic'),
                                     (',', 'Česko')]
country_bigrams['India'] = [(',', 'Delhi'), (',', 'India'),
                            ('Indian', 'Institute'), ('Indian', 'Association'),
                            ('Indira', 'Gandhi'),
                            ('Bhabha', 'Atomic'),
                            ('Allahabod', 'University'),
                            ('Tata', 'Institute'),
                            (',', 'Allahabad'),]
country_bigrams['Uzbekistan'] = [(',', 'Uzbekistan')]
country_bigrams['Algérie'] = [(',', 'Algérie')]
country_bigrams['Bénin'] = [(',', 'Bénin')]
country_bigrams['Belarussia'] = [(',', 'Belarussia')]
country_bigrams['Syria'] = [(',', 'Syria')]
country_bigrams['USSR'] = [(',', 'USSR'),(',', 'U.S.S.R')]
country_bigrams['Cameroun'] = [(',', 'Cameroun')]
country_bigrams['Qatar'] = [(',', 'Qatar')]
country_bigrams['Usbekistan'] = [(',', 'Usbekistan')]
country_bigrams['Honduras'] = [(',', 'Honduras')]
country_bigrams['Nigeria'] = [(',', 'Nigeria')]
country_bigrams['Libya'] = [(',', 'Libya')]
country_bigrams['Crete'] = [(',', 'Crete')]
country_bigrams['Congo'] = [(',', 'Congo'),(',', 'Zaïre'),(',', 'Zaire')]
country_bigrams['Cyprus'] = [(',', 'Cyprus')]
country_bigrams['Tunisia'] = [(',', 'Tunisie'),(',', 'Tunisia')]
country_bigrams['Macedonia'] = [(',', 'Macedonia')]
country_bigrams['Kazhakhstan'] = [(',', 'Kazhakhstan')]
country_bigrams['Senegal'] = [(',', 'Senegal'),(',', 'Sénégal')]
country_bigrams['Guatemala'] = [(',', 'Guatemala')]
country_bigrams['Morroco'] = [(',', 'Morroco')]
country_bigrams['Trinidad and Tobago'] = [(',', 'Trinidad')]
country_bigrams['Ecuador'] = [(',', 'Ecuador')]


#%% trigrams

# Hand-curated mapping: country name -> list of token trigrams (3-tuples of
# strings) associated with that country.
# NOTE(review): presumably these are compared with the trigrams produced by
# nltk.trigrams(word_tokenize(affiliation)); the matching code is not visible
# here. Every entry must therefore be a 3-tuple. The USA list used to carry
# the 2-tuple (',', 'Bloomington'), which can never equal a trigram; if that
# pattern is wanted it belongs in the bigram table.
country_trigrams = {}

country_trigrams['China'] = [('Republic', 'of', 'China'),(',', 'Chinese', 'Academy'),
                             ('Huazhong', 'Normal', 'University'),
                             ('State', 'Key', 'Laboratory')]
country_trigrams['USA'] = [('University', 'of', 'California'),(',', 'New', 'Jersey'),
                           (',', 'Los', 'Alamos'),
                           ('Cambridge', ',', 'Massachusetts'),
                           ('Berkeley', ',', 'California'),
                           ('University', 'of', 'Illinois'),
                           ('Urbana', ',', 'Illinois'),
                           ('New', 'Mexico', '87545'),
                           ('Pittsburgh', ',', 'Pa'),
                           ('Los', 'Alamos', 'National'),
                           ('Massachusetts', 'Institute', 'of'),
                           ('Chicago', ',', 'Illinois'),
                           (',', 'North', 'Carolina'),
                           ('Santa', 'Barbara', ','),
                           ('Angeles', ',', 'California'),
                           ('University', 'of', 'Texas'),
                           ('University', 'of', 'Maryland'),
                           ('California', ',', 'Berkeley'),
                           ('Oak', 'Ridge', 'National'),
                           ('University', 'of', 'Chicago'),
                           ('Princeton', 'University', ','),
                           ('University', 'of', 'Michigan'),
                           ('Stony', 'Brook', ','),
                           ('Iowa', 'State', 'University'),
                           ('Argonne', ',', 'Illinois'),
                           ('California', 'Institute', 'of'),
                           ('Austin', ',', 'Texas'),
                           ('Argonne', 'National', 'Laboratory'),
                           ('University', ',', 'Ithaca'),
                           ('Ames', ',', 'Iowa'),
                           ('Pasadena', ',', 'California'),
                           ('Stanford', ',', 'California'),
                           ('Barbara', ',', 'California'),
                           ('Maryland', ',', 'College'),
                           ('Ithaca', ',', 'New'),
                           ('Iowa', 'State', 'University'),
                           ('University', 'of', 'Colorado'),
                           ('University', 'of', 'Pennsylvania'),
                           ('Pennsylvania', 'State', 'University'),
                           ('University', 'of', 'Washington'),
                           ('University', 'of', 'Minnesota'),
                           ('Florida', 'State', 'University'),
                           ('University', 'of', 'Wisconsin'),
                           ('University', 'of', 'Tennessee'),
                           ('University', 'of', 'Massachusetts'),
                           ('Michigan', 'State', 'University'),
                           ('University', 'of', 'Rochester'),
                           ('Ohio', 'State', 'University'),
                           ('University', 'of', 'Florida'),
                           ('Houston', ',', 'Texas'),
                           ('University', 'of', 'Virginia'),
                           ('University', 'of', 'Nebraska'),
                           ('University', 'of', 'Alabama'),
                           ('University', 'of', 'Missouri'),
                           ('University', 'of', 'Oregon'),
                           ('Indiana', 'University', 'Bloomington'),
                           ('Louisiana', 'State', 'University'),
                           ('Kansas', 'State', 'University'),
                           ('University', 'of', 'Iowa'),
                           ('University', ',', 'Nashville'),
                           ('University', 'of', 'Delaware'),
                           ('University', 'of', 'Arkansas'),
                           ('Athens', ',', 'Georgia'),
                           ('Boulder', ',', 'Colorado'),
                           ('Atlanta', ',', 'Georgia'),
                           ('Wilmington', ',', 'Delaware'),
                           ('Louis', ',', 'Missouri'),
                           ('Fairfax',',', 'Virginia'),
                           ('University', 'of', 'Wisconsin-Madison'),
                           ('Virginia', 'Polytechnic', 'Institute'),
                           ('Virginia', 'State', 'University'),
                           (',', 'Rhode', 'Island'),
                           ('Johns', 'Hopkins', 'University'),
                           (',', 'N.', 'Y'),
                           ('University', 'of', 'Arizona'),
                           ('Livermore', 'National', 'Laboratory'),
                           ('University', 'of', 'Kansas'),
                           ('University', 'of', 'Oklahoma'),
                           ('Enrico', 'Fermi', 'Institute'),
                           ('United', 'Gas', 'Improvement'),
                           ('Wayne', 'State', 'University'),
                           ('Notre', 'Dame', 'Indiana'),
                           ('University', 'of', 'Pittsburgh'),
                           ('Westinghouse', 'Research', 'Laboratory'),
                           ('General', 'Electric', 'Company'),
                           ('Xerox', 'Webster', 'Research'),
                           ('Midwestern', 'Universities', 'Research'),
                           ('Ryerson', 'Physical', 'Laboratory'),
                           ('Norfolk', 'State', 'University'),
                           ('Bell', 'Telephone', 'Laboratories'),
                           ('George', 'Mason', 'University'),
                           ('Naval', 'Surface', 'Warfare'),
                           ('Naval', 'Research', 'Laboratory'),
                           ('Brookhaven', 'National', 'Laboratory'),
                           ('Coast', 'Artillery', 'Corps'),]
country_trigrams['Germany'] = [('Republic', 'of', 'Germany'),('Berlin', ',', 'Germany'),
                               ('Garching', ',', 'Germany'),
                               ('Dresden', ',', 'Germany'),
                               ('Universität', 'München', ','),
                               ('Technische', 'Universität', 'München'),
                               ('Frankfurt', 'am', 'Main'),
                               ('München', ',', 'Germany'),
                               ('Göttingen', ',', 'Germany'),
                               ('Aachen', ',', 'Germany')]
country_trigrams['Japan'] = [('University', 'of', 'Tokyo'),('Tohoku', 'University', ','),
                             ('Tsukuba', ',', 'Ibaraki'),(',', 'Osaka', 'University'),
                             (',', 'Tohoku', 'University'),
                             ('Osaka', 'University', ','),
                             ('University', 'of', 'Tsukuba'),
                             ('NTT', 'Advanced', 'Technology'),
                             ('Kagami', 'Memorial', 'Laboratory')]

country_trigrams['Spain'] = [('Madrid', ',', 'Spain'),('Barcelona', ',', 'Spain'),
                             ('Universitat', 'de', 'Barcelona'),
                             ('Universidad', 'de', 'Zaragoza'),
                             ('Universidad', 'de', 'Huelva'),]
country_trigrams['Brazil'] = [('Rio','de','Janeiro'),(',', 'São', 'Paulo'),
                              (',', 'Sao', 'Paulo'),
                              ('Paulo', ',', 'Brazil'),
                              ('University', 'of', 'Brasilia'),
                              ('M.', 'G.', 'Brasil'),
                              ('S.', 'P.', 'Brasil')]
country_trigrams['France'] = [('Paris', 'Cedex', '05'),(',', 'CNRS', ','),
                              ('CEDEX', ',', 'France'),('Grenoble', ',', 'France'),
                              ('Université', 'Pierre', 'et'),
                              (',', 'Université', 'Paris'),
                              ('Orsay', 'Cedex', ','),('Grenoble', 'Cedex', '9'),
                              ('Gif-sur-Yvette', ',', 'France'),
                              ('place', 'Jussieu', ','),
                              ('Cédex', ',', 'France'),
                              ('Université', 'Paris-Sud', ','),
                              ('Université', 'Joseph', 'Fourier'),
                              ('Université', 'de', 'Picardie')]

country_trigrams['Canada'] = [('Ontario', ',', 'Canada'),('British', 'Columbia', ','),
                              ('University', 'of', 'Toronto'),
                              ('Université', 'de', 'Montréal'),
                              ('University', 'of', 'Waterloo'),
                              ('University', 'of', 'Windsor'),
                              ('University', 'of', 'Alberta'),
                              ('Kingston', ',', 'Ontario'),
                              ('Windsor', ',', 'Ontario'),
                              ('Canadian', 'Australasian', 'Steamship')]
country_trigrams['Russia'] = [('Russian', 'Academy', 'of'),('Moscow', ',', 'Russia'),
                              (',', 'Russian', 'Academy'),
                              ('Moscow', 'State', 'University')]
country_trigrams['Italy'] = [('Milano', ',', 'Italy'),('Università', 'di', 'Roma'),
                             ('Pisa', ',', 'Italy'),
                              ('“', 'La', 'Sapienza'),('di', 'Roma', '“'),
                              ('Rome', ',', 'Italy'),
                              ('Università', 'di', 'Firenze')]
country_trigrams['Czech Republic'] = [(',', 'Czech', 'Republic')]
country_trigrams['Sweden'] = [('Stockholm', ',', 'Sweden'),
                              ('Chalmers', 'University', 'of')]

country_trigrams['United Kingdom'] = [(',', 'United', 'Kingdom'),
                                      (',', 'Imperial', 'College'),
                                      ('University', 'of', 'Cambridge'),
                                      ('University', 'of', 'Oxford'),
                                      ('University', 'of', 'Edinburgh'),
                                      ('University', 'of', 'Sussex'),
                                      ('Imperial', 'College', 'London'),
                                      ('Rutherford', 'Appleton', 'Laboratory'),
                                      ('Queen', 'Mary', 'College')]
country_trigrams['The Netherlands'] = [('Amsterdam', ',', 'The'),
                                       ('University', 'of', 'Groningen')]
country_trigrams['Poland'] = [('Warsaw', ',', 'Poland')]
country_trigrams['Switzerland'] = [('Lausanne', ',', 'Switzerland'),
                                   ('Zürich', ',', 'Switzerland'),
                                   ('Université', 'de', 'Genève'),
                                   ('Swiss', 'Federal', 'Institute'),
                                   ('Technischen', 'Hochschule', 'Zürich'),
                                   ('Ecole', 'Polytechnique', 'Fédérale')]
country_trigrams['Argentina'] = [('Aires', ',', 'Argentina'),
                                 ('Centro', 'Atómico', 'Bariloche')]
country_trigrams['Australia'] = [('Australian', 'National', 'University'),
                                 ('University', 'of', 'Queensland'),
                                 ('Australian', 'Research', 'Council'),
                                 ('New', 'South', 'Wales'),
                                 ('University', 'of', 'Melbourne'),
                                 ('Australasian', 'Steamship', 'Company'),]
country_trigrams['Finland'] = [('University', 'of', 'Helsinki'),
                               ('Helsinki', 'University', 'of')]
country_trigrams['Denmark'] = [('University', 'of', 'Denmark'),('Niels', 'Bohr', 'Institute')]
country_trigrams['Singapore'] = [('University', 'of', 'Singapore'),
                                 ('Nanyang', 'Technological', 'University')]
country_trigrams['Mexico'] = [('Autónoma', 'de', 'México')]
country_trigrams['Hungary'] = [('Budapest', ',', 'Hungary')]
country_trigrams['South Africa'] = [(',', 'South', 'Africa')]
country_trigrams['New Zealand'] = [(',', 'New', 'Zealand')]
country_trigrams['Croatia'] = [('University', 'of', 'Zagreb'),
                               ('Zagreb', ',', 'Croatia')]
country_trigrams['Taiwan'] = [('National', 'Taiwan', 'University'),
                              ('Chiao', 'Tung', 'University'),
                              ('Chang', 'Gung', 'College')]
country_trigrams['Latvia'] = [('University', 'of', 'Latvia')]
country_trigrams['USSR'] = [('Union', 'of', 'Soviet'),
                                    (',', 'U.S.S.R', '.')]
country_trigrams['Serbia'] = [('University', 'of', 'Belgrade'),
                              ('11001', 'Belgrade', ','),
                              ('Nikola', 'Tesla', 'University')]
country_trigrams['Armenia'] = [('Yerevan', 'State', 'University')]
country_trigrams['Chile'] = [('Universidad', 'de', 'Chile')]
country_trigrams['Georgia'] = [('Tbilisi','State','University'),
                               ('Tbilisi',',','Georgia'),
                               ('Republic', 'of', 'Georgia'),
                               (',', 'Georgian', 'Republic'),
                               ('Georgian', 'Academy', 'of'),
                               ('GE-0193',',', 'Georgia')]
country_trigrams['Bulgaria'] = [('Sofia', ',', 'Bulgaria'),
                                ('Bulgarian', 'Academy', 'of')]
country_trigrams['Romania'] = [('Bucharest', ',', 'Romania')]
country_trigrams['Korea'] = [('Republic', 'of', 'Korea'),('Seoul', 'National', 'University')]
country_trigrams['Slovakia'] = [('Slovak', 'Academy', 'of'),
                                ('Bratislava', ',', 'Slovakia')]
country_trigrams['Belgium'] = [('Libre', 'de', 'Bruxelles'),
                               ('Katholieke', 'Universiteit', 'Leuven'),
                               ('Katholike', 'Universiteit', 'Leuven'),
                               ('University', 'of', 'Antwerp'),
                               ('Catholique', 'de', 'Louvain'),
                               ('catholique', 'de', 'Louvain'),
                               ('(', 'LASMOS', ')'),]
country_trigrams['Portugal'] = [('Universidade', 'de', 'Lisboa')]
country_trigrams['Venezuela'] = [('Universidade', 'Simón', 'Bolívar')]
country_trigrams['Macedonia'] = [('Republic', 'of', 'Macedonia')]
country_trigrams['Palestine'] = [(',', 'West', 'Bank')]
country_trigrams['United Arab Republic'] = [('United', 'Arab', 'Republic')]
country_trigrams['Israel'] = [(',', 'Beer', 'Sheva'),
                              ('Tel', 'Aviv', 'University')]
country_trigrams['Macau'] = [('University', 'of', 'Macau')]


#%% quadrigrams
# Token 4-grams that identify a country; only consulted when no bigram or
# trigram matched an affiliation.
country_quadrigrams = {
    'USA': [
        ('San', 'Francisco', 'State', 'University'),
        ('American', 'Institute', 'of', 'Physics'),
    ],
    'Canada': [('University', 'of', 'Western', 'Ontario')],
    'Argentina': [('Universidad', 'de', 'Buenos', 'Aires')],
    'Brazil': [('Universidade', 'Federal', 'do', 'Ceará')],
    'Italy': [('Consiglio', 'Nazionale', 'delle', 'Ricerche')],
}

#%% custom regex expression
# Regular expressions (searched in the affiliation string after stripping
# leading/trailing ',&. ' characters) that identify a country.  They are only
# tried when no bigram/trigram matched.
#
# Patterns prefixed with the inline flag `(?i)` are case-insensitive.  Several
# of them used to be written `(i?)`, which is not a flag but a capturing group
# matching an optional literal "i", so those patterns were silently
# case-sensitive; they now carry the intended `(?i)` flag.
country_strings = {}
country_strings['Japan'] = [r'(?i)The Graduate University for Advanced Studies',r'(?i)Toyota',
                            r'(?i)(Japan)$',
                            r'(?i)Koto-ku',
                            r'(?i)Higashi',
                            r'(?i)(Shinonome)$']
country_strings['Spain'] = ['Catalana','Catalunya',
                            r'(?i)Universidad de Santiago de Compostela',
                            r'(?i)Consejo Superior de Investigaciones Científicas',
                            r'(?i)(Spain)$',r'(?i)(Sevilla)$','Donostia',
                            r'(?i)Universitat de les Illes Balears',
                            r'(?i)Universidad del País Vasco',
                            r'(?i)Instituto de Ciencia de Materiales',
                            'BIFI',
                            'Nicolás Cabrera']
country_strings['France'] = ['C.N.R.S','CNRS','CEA/Grenoble','Joseph Fourier',
                             r'(?i)Centre National de la Recherche Scientifique',
                             r'(?i)Futuroscope Cedex','Pierre et Marie Curie',
                             r'(Paris)$',r'(Saclay)$',r'(Nantes)$',
                             r'(?i)( Cedex \b\d+\b)',
                             r'(?i)(France)$',
                             r'(?i)Commissariat à l’Energie Atomique',
                             r"(?i)Commissariat `a l'Energie Atomique",
                             "Universitéd'Artois",
                             'Fr`eres Lumi`ere',
                             'Languedoc',
                             "Laboratoire de Physique Théorique de l'Ecole Normale Supérieure"]
country_strings['Canada'] = [r'(?i)Photoacoustic and Photothermal Sciences Laboratory, Department of Mechanical Engineering',
                             r'(?i)CIFAR Cosmology and Gravity',
                             r'(?i)(Canada)$',
                             r'(?i)Institut National de la Recherche Scientifique']
country_strings['Brazil'] = [r'(?i)(Brasil)$',
                             r'(?i)Universidade Federal do Rio Grande do Norte',
                             r'(?i)Universidade Federal do Rio Grande do Sul',
                             r'(?i)Universidade Federal de Minas Gerais',
                             r'(?i)Pontifícia Universidade Católica']
country_strings['Germany'] = [r'(?i)Martin-Luther-Universität',r'(Berlin)$',
                              r'(?i)(Germany)$',
                              r'(?i)Halle-Wittenburg',
                              r'(?i)Deutsches']
country_strings['USA'] = [r'(?i)Fermilab','U. S. Coast and Geodetic Survey','NASA',
                          r'(D\. ?C)$',r'(N\. ?Y)$',r'( Pa)$',
                          'Polytechnic Institute of Brooklyn',
                          r'( USA)$',
                          'Cornell',
                          'CIT-USC',
                          'ITAMP',
                          'JILA',
                          'Sound Laboratory Bureau of Standards',
                          r'^(Institute for Advanced Study)$',
                          r'^(Northwestern University)$',]
country_strings['India'] = [r'(?i)CSIR Centre for Mathematical Modelling and Computer Simulation',
                            r'(?i)(India)$']
country_strings['Italy'] = [r'( I-\b\d{2,}\b)','-CNR','CNR-','–CNR','CNR–',
                            '/CNR','CNR/',r'(?i)International School for Advanced Studies',
                            r'(?i)(Italy)$','INCM',
                            r'(?i)Istituto Nazionale per la Fisica della Materia',
                            r'(?i)Istituto Nazionale di Fisica della Materia',
                            r'(?i)Consorzio Nazionale Interuniversitario per le Scienze Fisiche della Materia',
                            "Universit`a di Camerino"]

country_strings['Greece'] = ['F.O.R.T.H',
                             r'(?i)(Greece)$',
                             'Hellenic',
                             'Hellas']
country_strings['Argentina'] = [r'(?i)Comisión Nacional de Energía Atómica',
                             r'(?i)(Argentina)$',
                             r'(?i)Consejo Nacional de Investigaciones Científicas']
country_strings['USSR'] = ['U. S. S. R','U S S R','USSR','U.S.S.R']
country_strings['Russia'] = ['Leningrad']
country_strings['Switzerland'] = ['CERN',
                                  r'(?i)(Switzerland)$']
country_strings['United Kingdom'] = ['University College, Gower Street',
                                     r'(?i)(United Kingdom)$']
country_strings['Belgium'] = ['atholique du Louvain',
                              r'(?i)(Belgium)$']
country_strings['Macedonia'] = [r'(?i)(Macedonia)$']
country_strings['Venezuela'] = [r'(?i)(Venezuela)$']
country_strings['The Netherlands'] = ['COBRA','NIKHEF']
country_strings['China'] = ['JOINGC','National Laboratory of Semiconductor Superlattices and Related Microstructures']


#%% first, try to match with bigrams and trigrams
import regex as re
#reverse bi/trigrams dict

# Invert the country -> [patterns] tables into pattern -> country lookups.
# If a pattern is listed under several countries, the last one listed wins.
bigrams_to_country = {
    gram: country
    for country, grams in country_bigrams.items()
    for gram in grams
}

trigrams_to_country = {
    gram: country
    for country, grams in country_trigrams.items()
    for gram in grams
}

quadrigrams_to_country = {
    gram: country
    for country, grams in country_quadrigrams.items()
    for gram in grams
}

string_to_country = {
    pattern: country
    for country, patterns in country_strings.items()
    for pattern in patterns
}

# affiliation string -> list of distinct countries detected in it
affil_to_country = {aff : [] for aff in affil_set}

# Compile the custom patterns once instead of for every affiliation.
compiled_string_to_country = [(re.compile(pattern), country)
                              for pattern, country in string_to_country.items()]

for i, aff in enumerate(affil_set):
    if not i%1000:
        print(i, len(affil_set))
    tokens = word_tokenize(aff)
    found = affil_to_country[aff]

    # exact lookup of every token trigram, then every token bigram
    for t in nltk.trigrams(tokens):
        country = trigrams_to_country.get(t)
        if country is not None and country not in found:
            found.append(country)

    for b in nltk.bigrams(tokens):
        country = bigrams_to_country.get(b)
        if country is not None and country not in found:
            found.append(country)

    # if nothing found, try quadrigrams and custom regex
    if len(found) == 0:
        for q in nltk.ngrams(tokens,4):
            country = quadrigrams_to_country.get(q)
            if country is not None and country not in found:
                found.append(country)

        # patterns anchored with ^ or $ must see the affiliation without its
        # leading/trailing punctuation
        stripped_aff = aff.strip(',&. ')
        for pattern, country in compiled_string_to_country:
            # Skip countries already recorded: two patterns of the same
            # country matching (e.g. 'CNRS' and 'France$') must not make the
            # affiliation look ambiguous.
            if country not in found and pattern.search(stripped_aff):
                found.append(country)
        
                    


    
#%%
# Split the affiliations by the number of countries found for each:
# none (unmatched), exactly one (matched) or several (ambiguous).
unmatched_affil = []
ambiguous_affil = []
matched_affil = []
for k, v in affil_to_country.items():
    n_found = len(v)
    if n_found == 0:
        unmatched_affil.append(k)
    elif n_found > 1:
        ambiguous_affil.append((k, v))
    else:
        matched_affil.append((k, v))

# fraction of affiliations in each category
n_affil = len(affil_set)
print(len(matched_affil)/n_affil)
print(len(ambiguous_affil)/n_affil)
print(len(unmatched_affil)/n_affil)
#%% use geotext on the unmatched


from flashgeotext.geotext import GeoText
geotext = GeoText()

# Run the GeoText extractor on every affiliation left unmatched so far and
# keep its raw result (indexed below by its 'countries' entry).
affil_geotext = {}
for i, aff in enumerate(unmatched_affil):
    if i % 1000 == 0:
        print(i, len(unmatched_affil))
    p = geotext.extract(aff)
    affil_geotext[aff] = p

# Split the results by the number of countries GeoText reported.
ambiguous_geotext = []
unmatched_geotext = []
matched_geotext = []
for k, p in affil_geotext.items():
    n_countries = len(p['countries'])
    if n_countries > 1:
        ambiguous_geotext.append((k, p))
    elif n_countries == 0:
        unmatched_geotext.append((k, p))
    else:
        matched_geotext.append((k, p))



#%% now try to catch spelling mistakes
import jellyfish 

def gram_similarity(g1,g2):
    """Return the average Jaro-Winkler similarity between two n-grams.

    Tokens are compared position by position (pairing stops at the shorter
    n-gram) and the summed similarity is divided by ``len(g1)``.
    """
    total = sum(jellyfish.jaro_winkler_similarity(a, b) for a, b in zip(g1, g2))
    return total/len(g1)

# minimum average token similarity for an n-gram to count as a match
thresh = 0.95

# affiliation -> countries found by fuzzy matching, for every affiliation
# that GeoText could not assign to a country
affil_to_country_fuzzy = {k: [] for k, _geo in unmatched_geotext}

for i, (aff, _geo) in enumerate(unmatched_geotext):
    if i % 1000 == 0:
        print(i, len(unmatched_geotext))
    tokens = word_tokenize(aff)
    hits = affil_to_country_fuzzy[aff]

    # compare every trigram, then every bigram, of the affiliation against
    # all the known n-grams of the same size
    for grams, known in ((nltk.trigrams(tokens), trigrams_to_country),
                         (nltk.bigrams(tokens), bigrams_to_country)):
        for g1 in grams:
            for g2, country in known.items():
                if gram_similarity(g1, g2) >= thresh and country not in hits:
                    hits.append(country)

# Split the fuzzy results by the number of countries found.
unmatched_affil_fuzzy = []
ambiguous_affil_fuzzy = []
matched_affil_fuzzy = []
for k, v in affil_to_country_fuzzy.items():
    if len(v) == 0:
        unmatched_affil_fuzzy.append(k)
    elif len(v) > 1:
        ambiguous_affil_fuzzy.append((k, v))
    else:
        matched_affil_fuzzy.append((k, v))


#%% now use open street map to reverse locate countries from geotext cities
from geopy import geocoders
import time
geocoder = geocoders.Nominatim(user_agent='alex_app')

# query string -> geocoding result, so that each query is sent only once
geocoder_cache = {}


def _geocode_cached(query):
    """Geocode `query`, reusing the result stored in `geocoder_cache` if any.

    Returns the value given by `geocoder.geocode` (which may be None; the
    callers below skip None results), or None if the request raised.  Failed
    requests are printed and not cached, so a later occurrence of the same
    query is tried again.
    """
    if query in geocoder_cache:
        return geocoder_cache[query]
    print('searching ', query)
    try:
        loc = geocoder.geocode(query, language='en')
    except Exception as e:
        # best effort: report the failure and carry on with the next query
        print(e)
        return None
    finally:
        # wait between requests, also after a failed one
        time.sleep(1)
    geocoder_cache[query] = loc
    return loc


# affiliation -> list of locations found for it.  Only real geocoding results
# are stored (never None or an error marker), since the later steps read
# `loc.address` on every entry.
affil_reversesearch = {aff : [] for aff in unmatched_affil_fuzzy}

for i,aff in enumerate(unmatched_affil_fuzzy):

    print(i, '\n"', aff, '"')
    p = geotext.extract(aff)

    locations = []
    for city in p['cities']:
        # NOTE(review): 'University' and 'Matera' are skipped, presumably
        # because GeoText reports them as cities by mistake - confirm
        if city == 'University' or city == 'Matera':
            continue
        loc = _geocode_cached(city)
        if loc is not None:
            locations.append(loc)

    # also try with the last part of the affiliation, unless the string ends
    # with a comma (i.e. it is only the first part of an address)
    if not aff.endswith(','):
        splits = aff.strip('.').split(',')

        if len(splits) > 1:
            # if it's not just the department name
            loc = _geocode_cached(splits[-1])
            if loc is not None and loc not in locations:
                locations.append(loc)

    print(i, '\n', locations)

    affil_reversesearch[aff] = locations


# Split the reverse-search results by the number of locations found.
ambiguous_reversesearch = [(k,p) for k,p in affil_reversesearch.items() if len(p) > 1]
unmatched_reversesearch= [(k,p) for k,p in affil_reversesearch.items() if len(p) == 0]
matched_reversesearch = [(k,p) for k,p in affil_reversesearch.items() if len(p) == 1]

#%% check if ambiguous is the same country
# For every affiliation with several locations, collect the distinct
# countries, taken as the last comma-separated component of each address.
ambiguous_reversesearch_country = []
for k,l in ambiguous_reversesearch:
    locs = []
    for loc in l:
        # failed or empty lookups may be stored as None or 'error', which
        # have no address: skip them instead of crashing
        address = getattr(loc, 'address', None)
        if address is None:
            continue
        country = address.split(',')[-1].strip()
        if country not in locs:
            locs.append(country)
    ambiguous_reversesearch_country.append((k,locs))

# keep only the affiliations whose locations are in different countries
ambiguous_reversesearch_country = [(k,l) for k,l in ambiguous_reversesearch_country if len(l)>1]

#%%  put together all the results


# affiliation -> list of countries, merged from the successive matching
# steps.  Each step only processed what the previous ones left unmatched, so
# an affiliation must never be contributed twice (checked by the asserts).
final_affil_to_country = {}


# 1. exact n-gram / regex matches
for aff, countries in affil_to_country.items():
    if len(countries)>0:
        final_affil_to_country[aff] = countries


# 2. GeoText matches.  This loop used to appear twice; the second copy
#    always tripped its own assert as soon as one GeoText match existed.
for aff, p in affil_geotext.items():
    if len(p['countries']) > 0:
        assert aff not in final_affil_to_country
        final_affil_to_country[aff] = list(p['countries'].keys())

# 3. fuzzy n-gram matches
for aff, countries in affil_to_country_fuzzy.items():
    if len(countries) > 0:
        assert aff not in final_affil_to_country
        final_affil_to_country[aff] = countries

# 4. reverse geocoding: the country is the last comma-separated component
#    of each address
for aff, locs in affil_reversesearch.items():
    if len(locs)>0:
        assert aff not in final_affil_to_country
        countries = []
        for loc in locs:
            # failed or empty lookups may be stored as None or 'error',
            # which have no address: skip them instead of crashing
            address = getattr(loc, 'address', None)
            if address is None:
                continue
            country = address.split(',')[-1].strip()
            if country not in countries:
                countries.append(country)
        # like the other steps, record only affiliations with a country
        if len(countries) > 0:
            final_affil_to_country[aff] = countries
                

#%% remap country names
# number of affiliations assigned to each country name, before remapping
country_counter = Counter()
for countries in final_affil_to_country.values():
    country_counter.update(countries)
                
# country names to replace, mapped to the name used instead
remapping = {
    'United States of America': 'USA',
    'The Netherlands': 'Netherlands',
    'Cameroun': 'Cameroon',
}

def remapper(country):
    """Return the replacement name of `country`, or `country` itself if it has none."""
    return remapping.get(country, country)

# apply the remapping to every country of every affiliation
final_affil_to_country = {
    aff: [remapper(c) for c in countries]
    for aff, countries in final_affil_to_country.items()
}

# number of affiliations per country name, after remapping
new_country_counter = Counter()
for countries in final_affil_to_country.values():
    new_country_counter.update(countries)
  
#%% save data 

import json

# write the affiliation -> countries mapping as a JSON file
serialized = json.dumps(final_affil_to_country)
with open('../paper_data/aps/data_share/affiliations_to_countries.json','w') as fopen:
    fopen.write(serialized)
    
